Here are the packages used in this session:

# REMOVE ALL OBJECTS
rm(list=ls())

# access the MASS package
library(MASS)
library(corrplot)
library(tidyverse)
library(ggplot2)
library(GGally)
library(plotly)

Data

Let’s load the Boston data from the MASS package. Data is all about the “Housing Values in Suburbs of Boston”. More about the data can be read from here.

# load the data
data("Boston")

# explore the dataset
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08204   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00
# plot matrix of the variables
ggpairs(Boston, lower = "blank", upper = list(continuous = "points", combo =
  "facethist", discrete = "facetbar", na = "na"))

Boston dataset has 506 observations and 14 variables. All variables are in numeric format. Variable chas is a dummy variable (Charles River dummy variable, 1 if tract bounds river and 0 otherwise) and variable rad is an index (index of accessibility to radial highways).

Only by waching the plot above, it looks that the data has some linear relations. These relations can bee seen with correlation matrix.

# calculate the correlation matrix and round it
cor_matrix<-cor(Boston) 

# print the correlation matrix
cor_matrix %>% round(digits = 2)
##          crim    zn indus  chas   nox    rm   age   dis   rad   tax
## crim     1.00 -0.20  0.41 -0.06  0.42 -0.22  0.35 -0.38  0.63  0.58
## zn      -0.20  1.00 -0.53 -0.04 -0.52  0.31 -0.57  0.66 -0.31 -0.31
## indus    0.41 -0.53  1.00  0.06  0.76 -0.39  0.64 -0.71  0.60  0.72
## chas    -0.06 -0.04  0.06  1.00  0.09  0.09  0.09 -0.10 -0.01 -0.04
## nox      0.42 -0.52  0.76  0.09  1.00 -0.30  0.73 -0.77  0.61  0.67
## rm      -0.22  0.31 -0.39  0.09 -0.30  1.00 -0.24  0.21 -0.21 -0.29
## age      0.35 -0.57  0.64  0.09  0.73 -0.24  1.00 -0.75  0.46  0.51
## dis     -0.38  0.66 -0.71 -0.10 -0.77  0.21 -0.75  1.00 -0.49 -0.53
## rad      0.63 -0.31  0.60 -0.01  0.61 -0.21  0.46 -0.49  1.00  0.91
## tax      0.58 -0.31  0.72 -0.04  0.67 -0.29  0.51 -0.53  0.91  1.00
## ptratio  0.29 -0.39  0.38 -0.12  0.19 -0.36  0.26 -0.23  0.46  0.46
## black   -0.39  0.18 -0.36  0.05 -0.38  0.13 -0.27  0.29 -0.44 -0.44
## lstat    0.46 -0.41  0.60 -0.05  0.59 -0.61  0.60 -0.50  0.49  0.54
## medv    -0.39  0.36 -0.48  0.18 -0.43  0.70 -0.38  0.25 -0.38 -0.47
##         ptratio black lstat  medv
## crim       0.29 -0.39  0.46 -0.39
## zn        -0.39  0.18 -0.41  0.36
## indus      0.38 -0.36  0.60 -0.48
## chas      -0.12  0.05 -0.05  0.18
## nox        0.19 -0.38  0.59 -0.43
## rm        -0.36  0.13 -0.61  0.70
## age        0.26 -0.27  0.60 -0.38
## dis       -0.23  0.29 -0.50  0.25
## rad        0.46 -0.44  0.49 -0.38
## tax        0.46 -0.44  0.54 -0.47
## ptratio    1.00 -0.18  0.37 -0.51
## black     -0.18  1.00 -0.37  0.33
## lstat      0.37 -0.37  1.00 -0.74
## medv      -0.51  0.33 -0.74  1.00
# visualize the correlation matrix
##corrplot(cor_matrix, method="circle", type = "upper", cl.pos = "b", tl.pos = "d", tl.cex = 0.6)
corrplot(cor_matrix, method="circle", tl.pos = "d")

Some high correlations between variables can be seen, for example between index of accessibility to radial highways and full-value property-tax rate per $10,000 (0.91), proportion of owner-occupied units built prior to 1940 and weighted mean of distances to five Boston employment centres (-0.75) and nitrogen oxides concentration and proportion of non-retail business acres per town (-0.77).

LDA

For further analysis, we will standardize the data. Let’s have a same kind of lookup in the data after we have standadized the data.

# center and standardize variables
boston_std <- scale(Boston)

# summaries of the scaled variables
summary(boston_std)
##       crim                 zn               indus        
##  Min.   :-0.419367   Min.   :-0.48724   Min.   :-1.5563  
##  1st Qu.:-0.410563   1st Qu.:-0.48724   1st Qu.:-0.8668  
##  Median :-0.390280   Median :-0.48724   Median :-0.2109  
##  Mean   : 0.000000   Mean   : 0.00000   Mean   : 0.0000  
##  3rd Qu.: 0.007389   3rd Qu.: 0.04872   3rd Qu.: 1.0150  
##  Max.   : 9.924110   Max.   : 3.80047   Max.   : 2.4202  
##       chas              nox                rm               age         
##  Min.   :-0.2723   Min.   :-1.4644   Min.   :-3.8764   Min.   :-2.3331  
##  1st Qu.:-0.2723   1st Qu.:-0.9121   1st Qu.:-0.5681   1st Qu.:-0.8366  
##  Median :-0.2723   Median :-0.1441   Median :-0.1084   Median : 0.3171  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.:-0.2723   3rd Qu.: 0.5981   3rd Qu.: 0.4823   3rd Qu.: 0.9059  
##  Max.   : 3.6648   Max.   : 2.7296   Max.   : 3.5515   Max.   : 1.1164  
##       dis               rad               tax             ptratio       
##  Min.   :-1.2658   Min.   :-0.9819   Min.   :-1.3127   Min.   :-2.7047  
##  1st Qu.:-0.8049   1st Qu.:-0.6373   1st Qu.:-0.7668   1st Qu.:-0.4876  
##  Median :-0.2790   Median :-0.5225   Median :-0.4642   Median : 0.2746  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.6617   3rd Qu.: 1.6596   3rd Qu.: 1.5294   3rd Qu.: 0.8058  
##  Max.   : 3.9566   Max.   : 1.6596   Max.   : 1.7964   Max.   : 1.6372  
##      black             lstat              medv        
##  Min.   :-3.9033   Min.   :-1.5296   Min.   :-1.9063  
##  1st Qu.: 0.2049   1st Qu.:-0.7986   1st Qu.:-0.5989  
##  Median : 0.3808   Median :-0.1811   Median :-0.1449  
##  Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.4332   3rd Qu.: 0.6024   3rd Qu.: 0.2683  
##  Max.   : 0.4406   Max.   : 3.5453   Max.   : 2.9865
# class of the boston_scaled object
class(boston_std)
## [1] "matrix"
# change the object to data frame
boston_scaled <- as.data.frame(boston_std)

From summary we can see that we have succesfully standardized the data. All the variables have mean zero and deviation is standardized. Let’s analyse the crime rate variable deeper.

summary(boston_scaled$crim)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.419367 -0.410563 -0.390280  0.000000  0.007389  9.924110
# create a quantile vector of crim and print it
bins <- quantile(boston_scaled$crim)
bins
##           0%          25%          50%          75%         100% 
## -0.419366929 -0.410563278 -0.390280295  0.007389247  9.924109610
# create a categorical variable 'crime'
crime <- cut(boston_scaled$crim, breaks = bins, include.lowest = TRUE, label = c("low", "med_low", "med_high", "high"))

# look at the table of the new factor crime
table(crime)
## crime
##      low  med_low med_high     high 
##      127      126      126      127
# remove original crim from the dataset
boston_scaled <- dplyr::select(boston_scaled, -crim)

# add the new categorical value to scaled data
boston_scaled <- data.frame(boston_scaled, crime)

As wee see from crime table above, all quantiles have same number of samples as it should be. Let’s divide the dataset to train and test sets, so that 80% of the data belongs to the train set.

# number of rows in the Boston dataset 
n <- nrow(boston_scaled)

# choose randomly 80% of the rows
ind <- sample(n,  size = n * 0.8)

# create train set
train <- boston_scaled[ind,]

# create test set 
test <- boston_scaled[-ind,]

# save the correct classes from test data
correct_classes <- test$crime

# remove the crime variable from test data
test <- dplyr::select(test, -crime)

Next we will fit the linear discriminant analysis on the train set. We use the categorical crime rate as the target variable and all the other variables in the dataset as predictor variables.

# linear discriminant analysis
lda.fit <- lda(crime ~ ., data = train)

# print the lda.fit object
lda.fit
## Call:
## lda(crime ~ ., data = train)
## 
## Prior probabilities of groups:
##       low   med_low  med_high      high 
## 0.2475248 0.2400990 0.2574257 0.2549505 
## 
## Group means:
##                   zn      indus        chas        nox          rm
## low       1.01195889 -0.8914962 -0.15421606 -0.8915182  0.41040219
## med_low  -0.06664642 -0.2512232  0.01179157 -0.5801995 -0.12625162
## med_high -0.40313503  0.2526151  0.18195173  0.3834211  0.03930393
## high     -0.48724019  1.0149946 -0.04298342  1.0447415 -0.41213930
##                 age        dis        rad        tax     ptratio
## low      -0.8847214  0.8631522 -0.6752305 -0.7191125 -0.46908044
## med_low  -0.3324247  0.3468618 -0.5568200 -0.4642744 -0.05136559
## med_high  0.4275041 -0.3667114 -0.4186806 -0.2844423 -0.22595832
## high      0.8309102 -0.8622126  1.6596029  1.5294129  0.80577843
##                black      lstat         medv
## low       0.37546437 -0.7460324  0.497242788
## med_low   0.34641451 -0.1512194  0.002149698
## med_high  0.06043611  0.0294901  0.113631234
## high     -0.82143789  0.9643592 -0.764781732
## 
## Coefficients of linear discriminants:
##                 LD1          LD2         LD3
## zn       0.10565610  0.770527031 -0.93871487
## indus   -0.03103631 -0.201275667  0.36106274
## chas    -0.08602528  0.009090412  0.16620451
## nox      0.28863128 -0.905194999 -1.39546007
## rm      -0.10036228 -0.030896885 -0.09011387
## age      0.30547011 -0.340909369 -0.12420796
## dis     -0.10143331 -0.483280011  0.21932227
## rad      3.41506444  0.962042948 -0.01801627
## tax      0.03161503 -0.056875892  0.52783841
## ptratio  0.15602545 -0.029744400 -0.26720231
## black   -0.15512039  0.035588268  0.17170765
## lstat    0.15887296 -0.235443319  0.28830115
## medv     0.13555355 -0.493785831 -0.30017110
## 
## Proportion of trace:
##    LD1    LD2    LD3 
## 0.9533 0.0363 0.0104
# the function for lda biplot arrows
lda.arrows <- function(x, myscale = 1, arrow_heads = 0.1, color = "red", tex = 0.75, choices = c(1,2)){
  heads <- coef(x)
  arrows(x0 = 0, y0 = 0, 
         x1 = myscale * heads[,choices[1]], 
         y1 = myscale * heads[,choices[2]], col="orange", length = arrow_heads)
  text(myscale * heads[,choices], labels = row.names(heads), 
       cex = tex, col= "orange", pos=3)
}

# target classes as numeric
classes <- as.numeric(train$crime)

# plot the lda results
plot(lda.fit, dimen = 2, col = classes, pch = classes)
lda.arrows(lda.fit, myscale = 1)

From the plot and summary above we can see that variable index of accessibility to radial highways has the greatest seperation weight related to other variables, both in LD1 and LD2 dimensions. In LDI coefficient for index of accessibility to radial highways is 3.4 when second greatest weight, 0.37, is variables nitrogen oxides concentration. In dimension LD2 variable index of accessibility to radial highways does not have such a dominance as in LD1. In both dimensions it is positively correlated with crime rate. In LD2 coefficient for it is 0.9, when for example variables proportion of residential land zoned for lots over 25,000 sq.ft. and nitrogen oxides concentration (parts per 10 million) have both coefficient around 0.7 as in absolute value, but these LD2 components have oppposite directions (nitrogen oxides concentration (parts per 10 million) increases criminal rate). In our LD1-LD2 map moving south-east inreases the criminal rate.

For next we will test how well our model predicts the crime rate. Earlier in our analysis we separated our data in learning and testing parts and we removed categorial variable crime from test data.

# predict classes with test data
lda.pred <- predict(lda.fit, newdata = test)

# cross tabulate the results
table(correct = correct_classes, predicted = lda.pred$class)
##           predicted
## correct    low med_low med_high high
##   low       13      12        2    0
##   med_low    2      21        6    0
##   med_high   1       6       14    1
##   high       0       0        1   23

In table above we see that our model predicts high crime rate very well. Also in other rate classes our model makes quite good predictions, even in classes med_high and med_low.

K-means clustering

Let’s use original standardized data with original crime rate variable. Then we will alalyse data with K-means clustering method.

boston_std <- as.data.frame(boston_std)


boston_dist <- dist(boston_std)

set.seed(123)

# determine the number of clusters
k_max <- 10

# calculate the total within sum of squares
twcss <- sapply(1:k_max, function(k){kmeans(boston_std, k)$tot.withinss})

# visualize the results
qplot(x = 1:k_max, y = twcss, geom = 'line')

Based on graph above we decide to use two clusters.

# K-means clustering
km <-kmeans(boston_std, centers = 2)



# plot the Boston dataset with clusters
#pairs(Boston, col = km$cluster)
class(km$cluster)
## [1] "integer"
ggpairs(boston_std, mapping = aes(col = as.factor(km$cluster), alpha = 0.3), lower = "blank", upper = list(continuous = "points", combo =
  "facethist", discrete = "facetbar", na = "na"))

From the plot above we see that k-means clustering works quite well, in most cases those two clusters are enough to seperate the data to two resonable distributions. For example, in the case of nitrogen oxides concentration (nox) we see quite clearly two different distributions.

Super Bonus

model_predictors <- dplyr::select(train, -crime)

# check the dimensions

dim(model_predictors)
## [1] 404  13
dim(lda.fit$scaling)
## [1] 13  3
# matrix multiplication

matrix_product <- as.matrix(model_predictors) %*% lda.fit$scaling
matrix_product <- as.data.frame(matrix_product)


plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = train$crime)
# Let's find k-clusters for train set

n_k <- nrow(Boston)

ind_k <- sample(n_k, size = n_k*0.8)


train_k <- Boston[ind_k,]

train_k <- scale(train_k)


twcss2 <- sapply(1:k_max, function(k){kmeans(stats::na.omit(train_k), k)$tot.withinss})

# visualize the results
qplot(x = 1:k_max, y = twcss2, geom = 'line') # 2

# k-means clustering
km_k <-kmeans(train_k, centers = 2)

plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km_k$cluster)
# Let's increase the number of clusters to 4

km_l <-kmeans(train_k, centers = 4)

plot_ly(x = matrix_product$LD1, y = matrix_product$LD2, z = matrix_product$LD3, type= 'scatter3d', mode='markers', color = km_l$cluster)

In this part we can compare the 3D scatterplots where base vectors are from LDA model made above. All the points in these three 3D scatterplots are in the same cordinates, only the colours differ.The first 3D scatterplot is colored with four crime rates, the second with 2-mean clustering method and the third with 4-mean clustering method. Colours distributions in k-mean clustering plots do not respond the colour distibution of crime rate distribution. It might be that criminal rate does not have as high weight as some other variables in the sense of clustering because it has relatively slight correlation with other variables (this can be seen above).